/* Distributed under the same license as org.bytedeco.javacpp
 * (c) kaba
 */

import org.bytedeco.javacpp.BytePointer;
import org.bytedeco.javacpp.IntPointer;
import org.bytedeco.javacpp.LongPointer;
import org.bytedeco.javacpp.PointerPointer;
import org.bytedeco.javacpp.cuda;

/**
 *
 * @author kaba
 */
/**
 * Vector-addition demo for the CUDA driver API via the JavaCPP bindings
 * ({@code org.bytedeco.javacpp.cuda}).
 * <p>
 * Workflow: initialize the driver, create a context on device 0, allocate
 * device buffers, copy two input vectors to the device, launch a precompiled
 * PTX kernel ({@link #VECTOR_ADD_PTX}), copy the result back, and verify it
 * on the host. One input element is deliberately corrupted so the
 * verification step demonstrably catches errors.
 *
 * @author kaba
 */
public class VectorAddDrv
{
    /** Number of vector elements. */
    private static int N = 1000000;

    /**
     * Fails fast when a CUDA driver API call does not succeed; without this,
     * errors (no device, OOM, bad PTX) silently produce garbage results.
     *
     * @param result    return code of a {@code cuda.cu*} call
     * @param operation name of the call, used in the error message
     * @throws RuntimeException if {@code result} is not {@code CUDA_SUCCESS}
     */
    private static void check( int result, String operation ) {
        if( result != cuda.CUDA_SUCCESS )
            throw new RuntimeException( "CUDA error "+result+" in "+operation );
    }

    public static void main(String[] args) {
        int[] a=new int[N], b=new int[N], c=new int[N];  // arrays in host (heap) memory
        long[] dev_a={0}, dev_b={0}, dev_c={0};  // "pointers" (device addresses) to device memory

        // init the driver and pick device 0
        check( cuda.cuInit(0), "cuInit" );
        int[] cudaDevice = {0};
        check( cuda.cuDeviceGet(cudaDevice,0), "cuDeviceGet" );

        // obtain a context on that device
        cuda.CUctx_st context = new cuda.CUctx_st();
        check( cuda.cuCtxCreate( context, 0, cudaDevice[0] ), "cuCtxCreate" );

        // allocate device memory for the two inputs and the output
        check( cuda.cuMemAlloc( dev_a, N * Integer.BYTES ), "cuMemAlloc a" );
        check( cuda.cuMemAlloc( dev_b, N * Integer.BYTES ), "cuMemAlloc b" );
        check( cuda.cuMemAlloc( dev_c, N * Integer.BYTES ), "cuMemAlloc c" );

        // prepare host arrays; the expected result is c[i] == (i*i) - i
        for( int i=0; i<N; i++) {
            a[i] = -i;
            b[i] = i*i;
        }
        a[N-3]=1;  // deliberately introduce an error so the check below demonstrably works

        // copy input host (heap) arrays via native memory to the device
        IntPointer nat_a = new IntPointer( a );  // host->nat
        check( cuda.cuMemcpyHtoD( dev_a[0], nat_a, a.length*Integer.BYTES ), "cuMemcpyHtoD a" );  // nat->dev
        IntPointer nat_b = new IntPointer( b );  // host->nat
        check( cuda.cuMemcpyHtoD( dev_b[0], nat_b, b.length*Integer.BYTES ), "cuMemcpyHtoD b" );  // nat->dev
        nat_a.deallocate();  // native staging buffers no longer needed after the copies
        nat_b.deallocate();

        // prepare the kernel - load the PTX module, then look up the function
        cuda.CUmod_st module = new cuda.CUmod_st();
        check( cuda.cuModuleLoadData( module, new BytePointer(VECTOR_ADD_PTX) ), "cuModuleLoadData" );
        cuda.CUfunc_st vector_add = new cuda.CUfunc_st();
        check( cuda.cuModuleGetFunction( vector_add, module, "vector_add" ), "cuModuleGetFunction" );

        // kernel parameters: each entry is a pointer to the argument's value
        PointerPointer kernelParameters = new PointerPointer(
            new IntPointer( new int[]{N} ),
            new LongPointer( dev_a ),
            new LongPointer( dev_b ),
            new LongPointer( dev_c ) );

        // run the kernel (the PTX uses a stride loop, so any grid size is correct)
        check( cuda.cuLaunchKernel( vector_add,
                         (N+255)/256, 1, 1, // Grid dimension
                         256, 1, 1, // Block dimension - all GPUs should manage > 256 threads per block
                         0, null, // Shared memory size and stream
                         kernelParameters, null ), // Kernel- and extra parameters
               "cuLaunchKernel" );

        // wait for the kernel; this also surfaces asynchronous execution errors
        check( cuda.cuCtxSynchronize(), "cuCtxSynchronize" );

        // copy output device array via native memory to host (heap) memory
        IntPointer nat_c = new IntPointer( c.length );
        check( cuda.cuMemcpyDtoH( nat_c, dev_c[0], c.length*Integer.BYTES ), "cuMemcpyDtoH c" );  // dev->nat
        nat_c.get( c );  // nat->host
        nat_c.deallocate();

        // release device resources before checking, so failures here are reported too
        check( cuda.cuMemFree( dev_a[0] ), "cuMemFree a" );
        check( cuda.cuMemFree( dev_b[0] ), "cuMemFree b" );
        check( cuda.cuMemFree( dev_c[0] ), "cuMemFree c" );
        check( cuda.cuModuleUnload( module ), "cuModuleUnload" );
        check( cuda.cuCtxDestroy( context ), "cuCtxDestroy" );

        // check results against the host-side expectation
        boolean ok = true;
        for( int i=0; i<N; i++) {
            if( c[i] != (i*i)-i ) {
                ok = false;
                System.out.println( "result incorrect at position "+i+": expected "+((i*i)-i)+", but got "+c[i] );
            }
        }
        if(ok)
            System.out.println( "calculation succeeded" );
    }

/** PTX module with CUDA function vector_add, taking a size and three arrays.
 * cf. https://devblogs.nvidia.com/even-easier-introduction-cuda/
 *
 * workflow (there are others):
 * the following CUDA function must be compiled manually to PTX, which must then
 * be copied into the String constant below
 * compile command: nvcc -ptx vector_add.cu -o vector_add.ptx
 * NB: remember to fix comment closing marks before compiling
 * /
extern "C"
__global__ void vector_add(int n, int *a, int *b, int *c)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for( int i = index; i < n; i+= stride )
    c[i] = a[i] + b[i];
}
*/
    private final static String VECTOR_ADD_PTX=
"//\n" +
"// Generated by NVIDIA NVVM Compiler\n" +
"//\n" +
"// Compiler Build ID: CL-23083092\n" +
"// Cuda compilation tools, release 9.1, V9.1.85\n" +
"// Based on LLVM 3.4svn\n" +
"//\n" +
"\n" +
".version 6.1\n" +
".target sm_30\n" +
".address_size 64\n" +
"\n" +
"	// .globl	vector_add\n" +
"\n" +
".visible .entry vector_add(\n" +
"	.param .u32 vector_add_param_0,\n" +
"	.param .u64 vector_add_param_1,\n" +
"	.param .u64 vector_add_param_2,\n" +
"	.param .u64 vector_add_param_3\n" +
")\n" +
"{\n" +
"	.reg .pred 	%p<3>;\n" +
"	.reg .b32 	%r<14>;\n" +
"	.reg .b64 	%rd<11>;\n" +
"\n" +
"\n" +
"	ld.param.u32 	%r5, [vector_add_param_0];\n" +
"	ld.param.u64 	%rd4, [vector_add_param_1];\n" +
"	ld.param.u64 	%rd5, [vector_add_param_2];\n" +
"	ld.param.u64 	%rd6, [vector_add_param_3];\n" +
"	cvta.to.global.u64 	%rd1, %rd6;\n" +
"	cvta.to.global.u64 	%rd2, %rd5;\n" +
"	cvta.to.global.u64 	%rd3, %rd4;\n" +
"	mov.u32 	%r6, %ntid.x;\n" +
"	mov.u32 	%r7, %ctaid.x;\n" +
"	mov.u32 	%r8, %tid.x;\n" +
"	mad.lo.s32 	%r13, %r6, %r7, %r8;\n" +
"	mov.u32 	%r9, %nctaid.x;\n" +
"	mul.lo.s32 	%r2, %r9, %r6;\n" +
"	setp.ge.s32	%p1, %r13, %r5;\n" +
"	@%p1 bra 	BB0_2;\n" +
"\n" +
"BB0_1:\n" +
"	mul.wide.s32 	%rd7, %r13, 4;\n" +
"	add.s64 	%rd8, %rd3, %rd7;\n" +
"	add.s64 	%rd9, %rd2, %rd7;\n" +
"	ld.global.u32 	%r10, [%rd9];\n" +
"	ld.global.u32 	%r11, [%rd8];\n" +
"	add.s32 	%r12, %r10, %r11;\n" +
"	add.s64 	%rd10, %rd1, %rd7;\n" +
"	st.global.u32 	[%rd10], %r12;\n" +
"	add.s32 	%r13, %r13, %r2;\n" +
"	setp.lt.s32	%p2, %r13, %r5;\n" +
"	@%p2 bra 	BB0_1;\n" +
"\n" +
"BB0_2:\n" +
"	ret;\n" +
"}";

}
